12wk-1: NYCTaxi 자료 시각화 (plotly mapbox)

plotly
Author

최규빈

Published

November 20, 2023

1. 강의영상

2. Imports

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import requests
import json 
# Route pandas' `.plot` accessor through plotly and make "plotly_white" the
# default template for every figure created afterwards.
pd.options.plotting.backend = "plotly"
pio.templates.default = "plotly_white"

# Download the US county boundaries (GeoJSON).  The timeout stops the notebook
# from hanging on a stalled connection, and raise_for_status() reports an HTTP
# error directly instead of failing later inside the JSON parser.
counties_response = requests.get(
    'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json',
    timeout=30,
)
counties_response.raise_for_status()
us_dict = counties_response.json()

# Shallow copy with only the 'features' entry replaced, so `us_dict` itself is
# left untouched.  Keeps the features whose NAME property contains "New York".
# NOTE(review): NAME looks like a county name rather than a state name, so this
# may select far fewer features than "all of New York State" -- confirm.
newyork_dict = {
    **us_dict,
    'features': [
        feature
        for feature in us_dict['features']
        if "New York" in feature['properties']['NAME']
    ],
}

3. NYCTaxi 자료 시각화

# Read the taxi trips and keep only every 100th row, renumbered from 0.
df = pd.read_csv("NYCTaxi.csv").iloc[::100].reset_index(drop=True)
# Show the available column names.
df.columns
Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')
# Preview the subsampled trip table.
df
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455
1 id3194108 1 2016-06-01 11:48:41 2016-06-01 12:19:07 1 -74.005028 40.746452 -73.972008 40.745781 N 1826
2 id3564028 1 2016-01-02 01:16:42 2016-01-02 01:19:56 1 -73.954132 40.774784 -73.947418 40.779633 N 194
3 id1660823 2 2016-03-01 06:40:18 2016-03-01 07:01:37 5 -73.982140 40.775326 -74.009850 40.721699 N 1279
4 id1575277 2 2016-06-11 16:59:15 2016-06-11 17:33:27 1 -73.999229 40.722881 -73.982880 40.778297 N 2052
... ... ... ... ... ... ... ... ... ... ... ...
14582 id3647353 1 2016-05-16 22:12:09 2016-05-16 22:27:46 1 -73.990219 40.737076 -73.986748 40.702194 N 937
14583 id2064944 1 2016-05-23 08:04:35 2016-05-23 08:19:20 1 -73.987068 40.730728 -73.974983 40.751331 N 885
14584 id3286731 2 2016-05-31 16:56:13 2016-05-31 17:38:44 1 -73.863541 40.769711 -73.994644 40.750435 N 2551
14585 id3453691 2 2016-03-07 18:11:54 2016-03-07 18:29:09 1 -74.006531 40.738232 -73.985970 40.726978 N 1035
14586 id0995846 2 2016-05-09 17:26:56 2016-05-09 18:30:37 2 -73.789543 40.647099 -73.960320 40.798180 N 3821

14587 rows × 11 columns

# One small, translucent marker per pickup location on a Carto basemap.
fig = px.scatter_mapbox(
    df,
    lon='pickup_longitude',
    lat='pickup_latitude',
    opacity=0.3,
    center=dict(lat=40.7322, lon=-73.9052),
    zoom=10,
    mapbox_style='carto-positron',
    width=800,
    height=600,
).update_traces(marker_size=2)
# Render with scroll-wheel zoom turned off.
fig.show(config=dict(scrollZoom=False))
# Density heat map of pickup locations (each point counted once, radius 1).
fig = px.density_mapbox(
    df,
    lon='pickup_longitude',
    lat='pickup_latitude',
    radius=1,
    center=dict(lat=40.7322, lon=-73.9052),
    zoom=10,
    mapbox_style='carto-positron',
    width=800,
    height=600,
)
# Render with scroll-wheel zoom turned off.
fig.show(config=dict(scrollZoom=False))
# Density heat map of pickup locations, with `passenger_count` passed as the
# z value of each point.
fig = px.density_mapbox(
    df,
    lon='pickup_longitude',
    lat='pickup_latitude',
    z='passenger_count',
    radius=2,
    center=dict(lat=40.7322, lon=-73.9052),
    zoom=10,
    mapbox_style='carto-positron',
    width=800,
    height=600,
)
# Render with scroll-wheel zoom turned off.
fig.show(config=dict(scrollZoom=False))
# Preview the frame with the natural log of trip_duration added as a new column.
df.assign(log_trip_duration=np.log(df['trip_duration']))
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration log_trip_duration
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455 6.120297
1 id3194108 1 2016-06-01 11:48:41 2016-06-01 12:19:07 1 -74.005028 40.746452 -73.972008 40.745781 N 1826 7.509883
2 id3564028 1 2016-01-02 01:16:42 2016-01-02 01:19:56 1 -73.954132 40.774784 -73.947418 40.779633 N 194 5.267858
3 id1660823 2 2016-03-01 06:40:18 2016-03-01 07:01:37 5 -73.982140 40.775326 -74.009850 40.721699 N 1279 7.153834
4 id1575277 2 2016-06-11 16:59:15 2016-06-11 17:33:27 1 -73.999229 40.722881 -73.982880 40.778297 N 2052 7.626570
... ... ... ... ... ... ... ... ... ... ... ... ...
14582 id3647353 1 2016-05-16 22:12:09 2016-05-16 22:27:46 1 -73.990219 40.737076 -73.986748 40.702194 N 937 6.842683
14583 id2064944 1 2016-05-23 08:04:35 2016-05-23 08:19:20 1 -73.987068 40.730728 -73.974983 40.751331 N 885 6.785588
14584 id3286731 2 2016-05-31 16:56:13 2016-05-31 17:38:44 1 -73.863541 40.769711 -73.994644 40.750435 N 2551 7.844241
14585 id3453691 2 2016-03-07 18:11:54 2016-03-07 18:29:09 1 -74.006531 40.738232 -73.985970 40.726978 N 1035 6.942157
14586 id0995846 2 2016-05-09 17:26:56 2016-05-09 18:30:37 2 -73.789543 40.647099 -73.960320 40.798180 N 3821 8.248267

14587 rows × 12 columns

# Density heat map of pickup locations, with log(trip_duration) passed as the
# z value of each point.
fig = px.density_mapbox(
    df.assign(log_trip_duration=np.log(df['trip_duration'])),
    lon='pickup_longitude',
    lat='pickup_latitude',
    z='log_trip_duration',
    radius=1.5,
    center=dict(lat=40.7322, lon=-73.9052),
    zoom=10,
    mapbox_style='carto-positron',
    width=800,
    height=600,
)
# Render with scroll-wheel zoom turned off.
fig.show(config=dict(scrollZoom=False))
# Preview: flag single-passenger trips and convert vendor_id to strings.
df.assign(
    alone=df['passenger_count'].eq(1),
    vendor_id=df['vendor_id'].astype(str),
)
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration alone
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455 True
1 id3194108 1 2016-06-01 11:48:41 2016-06-01 12:19:07 1 -74.005028 40.746452 -73.972008 40.745781 N 1826 True
2 id3564028 1 2016-01-02 01:16:42 2016-01-02 01:19:56 1 -73.954132 40.774784 -73.947418 40.779633 N 194 True
3 id1660823 2 2016-03-01 06:40:18 2016-03-01 07:01:37 5 -73.982140 40.775326 -74.009850 40.721699 N 1279 False
4 id1575277 2 2016-06-11 16:59:15 2016-06-11 17:33:27 1 -73.999229 40.722881 -73.982880 40.778297 N 2052 True
... ... ... ... ... ... ... ... ... ... ... ... ...
14582 id3647353 1 2016-05-16 22:12:09 2016-05-16 22:27:46 1 -73.990219 40.737076 -73.986748 40.702194 N 937 True
14583 id2064944 1 2016-05-23 08:04:35 2016-05-23 08:19:20 1 -73.987068 40.730728 -73.974983 40.751331 N 885 True
14584 id3286731 2 2016-05-31 16:56:13 2016-05-31 17:38:44 1 -73.863541 40.769711 -73.994644 40.750435 N 2551 True
14585 id3453691 2 2016-03-07 18:11:54 2016-03-07 18:29:09 1 -74.006531 40.738232 -73.985970 40.726978 N 1035 True
14586 id0995846 2 2016-05-09 17:26:56 2016-05-09 18:30:37 2 -73.789543 40.647099 -73.960320 40.798180 N 3821 False

14587 rows × 12 columns

# Pickup locations coloured by vendor.  vendor_id is converted to strings
# before being used as the colour column; the `alone` flag is added to the
# frame as well but is not mapped to any visual property here.
fig = px.scatter_mapbox(
    df.assign(
        alone=df['passenger_count'].eq(1),
        vendor_id=df['vendor_id'].astype(str),
    ),
    lon='pickup_longitude',
    lat='pickup_latitude',
    color='vendor_id',
    opacity=0.3,
    center=dict(lat=40.7322, lon=-73.9052),
    zoom=10,
    mapbox_style='carto-positron',
    width=800,
    height=600,
).update_traces(marker_size=2)
# Render with scroll-wheel zoom turned off.
fig.show(config=dict(scrollZoom=False))
# Per-trip features used by the animated maps, sorted by pickup hour.
#   alone             -- True when the trip carried exactly one passenger
#   hour              -- hour of day of the pickup
#   vendor_id         -- vendor id converted to strings
#   log_trip_duration -- NOTE: despite its name this is a boolean flag,
#                        True when log(trip_duration) > 8
#
# The hour is parsed with a single vectorised pd.to_datetime call rather than
# a per-element .apply.  It could also be sliced straight out of the text:
#   df.pickup_datetime.str.split(' ').str[-1].str.split(':').str[0].apply(int)
tidydata = df.assign(
    alone = df.passenger_count == 1,
    hour = pd.to_datetime(df.pickup_datetime).dt.hour,
    vendor_id = df.vendor_id.astype(str),
    log_trip_duration = np.log(df.trip_duration)>8
).sort_values(by='hour')
# Animated pickup map with one frame per pickup hour; colour encodes the
# vendor and marker size the passenger count (capped at size 5).
fig = px.scatter_mapbox(
    tidydata,
    lon='pickup_longitude',
    lat='pickup_latitude',
    color='vendor_id',
    size='passenger_count',
    size_max=5,
    animation_frame='hour',
    center=dict(lat=40.7322, lon=-73.9052),
    zoom=10,
    mapbox_style='carto-positron',
    width=800,
    height=600,
)
# Render with scroll-wheel zoom turned off.
fig.show(config=dict(scrollZoom=False))
# Bar chart of the mean passenger count per vendor; each bar is labelled with
# its value.
(
    tidydata
    .groupby('vendor_id')['passenger_count']
    .mean()
    .reset_index()
    .plot.bar(
        x='vendor_id',
        y='passenger_count',
        color='vendor_id',
        text='passenger_count',
    )
)
# Smallest and largest value of the log_trip_duration column.
(tidydata['log_trip_duration'].min(), tidydata['log_trip_duration'].max())
(False, True)
# Animated pickup map with one frame per pickup hour; colour encodes the
# log_trip_duration column and marker size the passenger count.  The colour
# range is fixed to the column's overall min/max.
fig = px.scatter_mapbox(
    tidydata,
    lon='pickup_longitude',
    lat='pickup_latitude',
    color='log_trip_duration',
    range_color=(
        tidydata['log_trip_duration'].min(),
        tidydata['log_trip_duration'].max(),
    ),
    size='passenger_count',
    size_max=5,
    animation_frame='hour',
    center=dict(lat=40.7322, lon=-73.9052),
    zoom=10,
    mapbox_style='carto-positron',
    width=800,
    height=600,
)
# Render with scroll-wheel zoom turned off.
fig.show(config=dict(scrollZoom=False))
# Thin the sample again: keep every 100th row of `df`, renumbered from 0.
df_small = df.iloc[::100].reset_index(drop=True)
df_small
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455
1 id3667993 2 2016-01-03 04:18:57 2016-01-03 04:27:03 1 -73.980522 40.730530 -73.997993 40.746220 N 486
2 id2002463 2 2016-01-14 12:28:56 2016-01-14 12:37:17 1 -73.965652 40.768398 -73.960068 40.779308 N 501
3 id1635353 2 2016-03-04 23:20:58 2016-03-04 23:49:29 5 -73.985092 40.759190 -73.962151 40.709850 N 1711
4 id1850636 1 2016-02-05 00:21:28 2016-02-05 00:52:24 1 -73.994537 40.750439 -74.025719 40.631100 N 1856
... ... ... ... ... ... ... ... ... ... ... ...
141 id0621879 1 2016-04-23 09:31:33 2016-04-23 09:51:33 1 -73.950783 40.743614 -74.006218 40.722729 N 1200
142 id2587483 2 2016-03-28 12:59:58 2016-03-28 13:08:11 2 -73.953903 40.787079 -73.940842 40.792461 N 493
143 id1030598 2 2016-03-03 11:44:24 2016-03-03 11:49:59 1 -74.005066 40.719143 -74.006065 40.735134 N 335
144 id3094934 1 2016-03-21 09:53:40 2016-03-21 10:22:20 1 -73.986153 40.722431 -73.985977 40.762669 N 1720
145 id0503659 2 2016-04-19 18:06:09 2016-04-19 18:23:09 2 -73.952209 40.784500 -73.966103 40.804832 N 1020

146 rows × 11 columns

def transform(df):
    """Reshape wide trip rows into long form, one row per trip endpoint.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``pickup_datetime``,
        ``pickup_longitude``, ``pickup_latitude``, ``dropoff_datetime``,
        ``dropoff_longitude`` and ``dropoff_latitude``.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``datetime``, ``lon``, ``lat`` and ``state``.  All
        pickup rows (``state == "pickup"``) come first, followed by all
        dropoff rows (``state == "dropoff"``), on a fresh 0..n-1 index.
    """
    def _endpoint(prefix):
        # Map e.g. pickup_datetime -> datetime so both endpoints share names.
        renamed = {
            f'{prefix}_datetime': 'datetime',
            f'{prefix}_longitude': 'lon',
            f'{prefix}_latitude': 'lat',
        }
        # assign() attaches the constant label directly; no string-based
        # DataFrame.eval is needed for this.
        return df[['id', *renamed]].rename(columns=renamed).assign(state=prefix)

    return pd.concat(
        [_endpoint('pickup'), _endpoint('dropoff')],
        ignore_index=True,
    )
# Preview: reshape each trip (grouped by id) into pickup/dropoff rows, stack
# them, then merge the remaining per-trip columns back in.  merge() is given
# no explicit key, so it joins on the columns the two frames share.
(
    pd.concat(
        [transform(trip) for _, trip in df_small.groupby('id')],
        ignore_index=True,
    )
    .merge(
        df_small.drop(columns=[
            'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
            'dropoff_datetime', 'dropoff_longitude', 'dropoff_latitude',
        ])
    )
)
id datetime lon lat state vendor_id passenger_count store_and_fwd_flag trip_duration
0 id0037819 2016-05-16 17:42:32 -73.986420 40.756569 pickup 2 6 N 273
1 id0037819 2016-05-16 17:47:05 -73.995300 40.740059 dropoff 2 6 N 273
2 id0049607 2016-03-13 18:48:49 -73.975922 40.754192 pickup 1 2 N 439
3 id0049607 2016-03-13 18:56:08 -73.988922 40.762859 dropoff 1 2 N 439
4 id0051866 2016-01-04 18:48:12 -73.962654 40.772449 pickup 1 1 N 638
... ... ... ... ... ... ... ... ... ...
287 id3825370 2016-05-08 17:36:48 -73.979195 40.669765 dropoff 1 4 N 2358
288 id3888107 2016-06-21 18:30:05 -73.969429 40.757469 pickup 2 1 N 878
289 id3888107 2016-06-21 18:44:43 -73.982742 40.771969 dropoff 2 1 N 878
290 id3988208 2016-03-01 21:40:13 -73.948929 40.797405 pickup 1 1 N 433
291 id3988208 2016-03-01 21:47:26 -73.967438 40.789543 dropoff 1 1 N 433

292 rows × 9 columns

# Long-form trips: one pickup row and one dropoff row per id, with the
# remaining per-trip columns merged back in.  merge() is given no explicit
# key, so it joins on the columns the two frames share.
df2 = (
    pd.concat(
        [transform(trip) for _, trip in df_small.groupby('id')],
        ignore_index=True,
    )
    .merge(
        df_small.drop(columns=[
            'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
            'dropoff_datetime', 'dropoff_longitude', 'dropoff_latitude',
        ])
    )
)
df2
id datetime lon lat state vendor_id passenger_count store_and_fwd_flag trip_duration
0 id0037819 2016-05-16 17:42:32 -73.986420 40.756569 pickup 2 6 N 273
1 id0037819 2016-05-16 17:47:05 -73.995300 40.740059 dropoff 2 6 N 273
2 id0049607 2016-03-13 18:48:49 -73.975922 40.754192 pickup 1 2 N 439
3 id0049607 2016-03-13 18:56:08 -73.988922 40.762859 dropoff 1 2 N 439
4 id0051866 2016-01-04 18:48:12 -73.962654 40.772449 pickup 1 1 N 638
... ... ... ... ... ... ... ... ... ...
287 id3825370 2016-05-08 17:36:48 -73.979195 40.669765 dropoff 1 4 N 2358
288 id3888107 2016-06-21 18:30:05 -73.969429 40.757469 pickup 2 1 N 878
289 id3888107 2016-06-21 18:44:43 -73.982742 40.771969 dropoff 2 1 N 878
290 id3988208 2016-03-01 21:40:13 -73.948929 40.797405 pickup 1 1 N 433
291 id3988208 2016-03-01 21:47:26 -73.967438 40.789543 dropoff 1 1 N 433

292 rows × 9 columns

# Preview: flag single-passenger trips and extract the hour of each endpoint's
# timestamp.
df2.assign(
    alone=df2['passenger_count'].eq(1),
    hour=df2['datetime'].apply(pd.to_datetime).dt.hour,
)
id datetime lon lat state vendor_id passenger_count store_and_fwd_flag trip_duration alone hour
0 id0037819 2016-05-16 17:42:32 -73.986420 40.756569 pickup 2 6 N 273 False 17
1 id0037819 2016-05-16 17:47:05 -73.995300 40.740059 dropoff 2 6 N 273 False 17
2 id0049607 2016-03-13 18:48:49 -73.975922 40.754192 pickup 1 2 N 439 False 18
3 id0049607 2016-03-13 18:56:08 -73.988922 40.762859 dropoff 1 2 N 439 False 18
4 id0051866 2016-01-04 18:48:12 -73.962654 40.772449 pickup 1 1 N 638 True 18
... ... ... ... ... ... ... ... ... ... ... ...
287 id3825370 2016-05-08 17:36:48 -73.979195 40.669765 dropoff 1 4 N 2358 False 17
288 id3888107 2016-06-21 18:30:05 -73.969429 40.757469 pickup 2 1 N 878 True 18
289 id3888107 2016-06-21 18:44:43 -73.982742 40.771969 dropoff 2 1 N 878 True 18
290 id3988208 2016-03-01 21:40:13 -73.948929 40.797405 pickup 1 1 N 433 True 21
291 id3988208 2016-03-01 21:47:26 -73.967438 40.789543 dropoff 1 1 N 433 True 21

292 rows × 11 columns

# Build the plotting frame once and share it between the line layer and the
# marker layer, instead of recomputing the same assign/sort for each.
trip_points = (
    df2.assign(
        alone=lambda d: d.passenger_count == 1,
        hour=lambda d: pd.to_datetime(d.datetime).dt.hour,
    )
    .sort_values('hour')
)

# One line per trip id joining its pickup and dropoff rows, coloured by
# whether the trip had a single passenger.
fig = px.line_mapbox(
    trip_points,
    lat="lat",
    lon="lon",
    center={'lat': 40.7322, 'lon': -73.9052},
    line_group='id',
    color='alone',
    hover_data=['hour'],  # list form is accepted by every plotly express version
    mapbox_style='carto-positron',
    zoom=10,
    width=800,
    height=600,
)

# Endpoint markers sized by trip duration.  Only the traces are reused, so no
# layout options (center, zoom, ...) are needed on this helper figure.
endpoint_markers = px.scatter_mapbox(
    data_frame=trip_points,
    lat='lat',
    lon='lon',
    color='alone',
    size='trip_duration',
    size_max=10,
)
# Add every colour-group trace, however many there are; indexing .data[0] and
# .data[1] directly would fail if only one `alone` value were present.
for marker_trace in endpoint_markers.data:
    fig.add_trace(marker_trace)

# Thin lines and slight transparency on every trace.
fig.update_traces(
    line={'width': 1},
    opacity=0.8,
)
# Render with scroll-wheel zoom turned off.
fig.show(config={'scrollZoom': False})